library(readr)
library(dplyr)
library(highcharter)
library(ggplot2)
library(ngram)
library(tidyr)
setwd("/Users/Apple/Documents/TaraFiles/University/term 8/Data Analysis/week_10/hw_10/")
Ten poorest countries are found by their GDP per capita (current US$)“. The percentages of poor people are found by”Poverty headcount ratio at 3.20 a day (2011 PPP) (% of population)“.
WDICountry_Series=read_csv("./WDI_csv/WDICountry-Series.csv")
WDICountry=read_csv("./WDI_csv/WDICountry.csv")
WDIData=read_csv("./WDI_csv/WDIData.csv")
WDIFootNote=read_csv("./WDI_csv/WDIFootNote.csv")
WDISeries_Time=read_csv("./WDI_csv/WDISeries-Time.csv")
WDISeries=read_csv("./WDI_csv/WDISeries.csv")
WDIData$`Indicator Name`%>%unique()%>%as.data.frame()->a
WDIData%>%filter(`Indicator Name`=="GDP per capita (current US$)" |
`Indicator Name`=="Poverty headcount ratio at $3.20 a day (2011 PPP) (% of population)" |
`Indicator Name`=="Poverty headcount ratio at $3.20 a day (2011 PPP) (% of population)") %>%
select(`Country Name`,`Country Code`,`2011`,`Indicator Name`) -> WDIData_q1
WDIData_q1%>%filter(`Indicator Name`=="GDP per capita (current US$)")%>%arrange(`2011`)%>%head(n=10L)
## # A tibble: 10 x 4
## `Country Name` `Country Code` `2011` `Indicator Name`
## <chr> <chr> <dbl> <chr>
## 1 Burundi BDI 260. GDP per capita (current…
## 2 Ethiopia ETH 355. GDP per capita (current…
## 3 Congo, Dem. Rep. COD 357. GDP per capita (current…
## 4 Niger NER 376. GDP per capita (current…
## 5 Liberia LBR 380. GDP per capita (current…
## 6 Sierra Leone SLE 445. GDP per capita (current…
## 7 Madagascar MDG 455. GDP per capita (current…
## 8 Central African Republic CAF 494. GDP per capita (current…
## 9 Malawi MWI 512. GDP per capita (current…
## 10 Gambia, The GMB 514. GDP per capita (current…
WDIData_q1%>%filter(`Indicator Name`=="GDP per capita (current US$)")%>%arrange(`2011`)%>%
select(`Country Name`)%>% .[1:10,] ->poorCOUNTRYcode
WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
filter(`Indicator Name`=="Poverty headcount ratio at national poverty lines (% of population)")%>%
select(`Country Name`,`2008`,`2010`,`2011`,`2012`,`2014`)->Q1_poverty
MEAN=c(Q1_poverty[1,6],Q1_poverty[2,2],Q1_poverty[3,5],Q1_poverty[4,3],
Q1_poverty[5,3],Q1_poverty[6,6],Q1_poverty[7,5],Q1_poverty[8,3],
Q1_poverty[9,4],Q1_poverty[10,4])
Q1_poverty$value=as.numeric(MEAN)
Q1_poverty=Q1_poverty%>%select(`Country Name`,value)%>%mutate(type="poor percent")
WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
filter(`Indicator Name`=="Life expectancy at birth, total (years)")%>%
select(`Country Name`,value=`2011`)%>%mutate(type="life_expectancy")->Q1_life
WDIData%>%filter(`Country Name` %in% poorCOUNTRYcode$`Country Name`)%>%
filter(`Indicator Name`=="Adjusted net national income per capita (current US$)")%>%
select(`Country Name`,value=`2011`)%>%mutate(type="AVGincome")->Q1_income
rbind(Q1_income,Q1_life,Q1_poverty)->q1
p1=ggplot(q1,aes(x=`Country Name`,y=value))+geom_bar(aes(fill = type),stat = "identity",position = "dodge",linetype = "dashed",color = "red",alpha=0.5)+
xlab("country name")+ylab("value")+ggtitle("Q1")+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p1
***
The Rwandan genocide, also known as the genocide against the Tutsi, was a genocidal mass slaughter of Tutsi in Rwanda by members of the Hutu majority government. An estimated 500,000 to 1,000,000 Rwandans were killed during the 100-day period from 7 April to mid-July 1994,constituting as many as 70% of the Tutsi population. (from wikipedia)
WDIData%>%filter(`Indicator Name`=="Life expectancy at birth, total (years)")->q2_life
q2_life=q2_life[,-c(1,3,4)]
q2_life=q2_life[,c(1,33:47)]%>%na.omit()
q2_life%>%filter(`Country Code`=="RWA")->RWA
RWA[,-1]%>%as.numeric()->RWA_pop
year=as.character(c(1991:2005))
data.frame(pop=RWA_pop,year=as.character(year))->RWA_data
rbind(q2_life%>%select(pop=`1991`)%>%mutate(year="1991"),
q2_life%>%select(pop=`1992`)%>%mutate(year="1992"),
q2_life%>%select(pop=`1993`)%>%mutate(year="1993"),
q2_life%>%select(pop=`1994`)%>%mutate(year="1994"),
q2_life%>%select(pop=`1995`)%>%mutate(year="1995"),
q2_life%>%select(pop=`1996`)%>%mutate(year="1996"),
q2_life%>%select(pop=`1997`)%>%mutate(year="1997"),
q2_life%>%select(pop=`1998`)%>%mutate(year="1998"),
q2_life%>%select(pop=`1999`)%>%mutate(year="1999"),
q2_life%>%select(pop=`2000`)%>%mutate(year="2000"),
q2_life%>%select(pop=`2001`)%>%mutate(year="2001"),
q2_life%>%select(pop=`2002`)%>%mutate(year="2002"),
q2_life%>%select(pop=`2003`)%>%mutate(year="2003"),
q2_life%>%select(pop=`2004`)%>%mutate(year="2004"),
q2_life%>%select(pop=`2005`)%>%mutate(year="2005"))->q2_life_box
hcboxplot(x = q2_life_box$pop, var = q2_life_box$year,outliers=F)%>%
hc_add_series(RWA_data, "spline", hcaes(y = pop,x=year), name = "Rwanda")%>%
hc_title(text="Rwanda Vs other countries")
When countries are sorted by their life expectancy in 2011, their health expenditure per capita has a decreasing trend. The higher the health expenditure per capita, the higher is their life expectancy.
# Q3
WDIData%>%filter(`Indicator Name`=="Life expectancy at birth, total (years)")%>%
select(`Country Name`,valueL=`2011`)->Q3_life
WDIData%>%filter(`Indicator Name`=="Current health expenditure per capita (current US$)")%>%
select(`Country Name`,valueH=`2011`)->Q3_health
Q3_health$valueH=Q3_health$valueH/100
full_join(Q3_life,Q3_health,by="Country Name")%>%na.omit()%>%
arrange(-valueL)%>%mutate(num=row_number())->Q3
rbind(Q3%>%select(num,value=valueH)%>%mutate(type="health expenditure in 100$"),
Q3%>%select(num,value=valueL)%>%mutate(type="Life expectancy"))->Q3_plot
p3=ggplot(Q3_plot,aes(x=num,y=value))+geom_smooth(aes(color =type))+
xlab("country")+ylab("value")+ggtitle("Q3")+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p3
***
Roughly speaking, there has been an increasing trend in the last 50 years. However in the last 10 years it has generally decreased.
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep.")%>%
filter(`Indicator Name`=="Household final consumption expenditure per capita (constant 2010 US$)")->ir_h
ir_h=ir_h[,-c(1:4)]
data.frame(power=as.numeric(ir_h[-c(58,59)]),year=c(1960:2016))->q4
p4=ggplot(q4,aes(x=year,y=power))+geom_line()+
xlab("year")+ylab("Household final consumption expenditure per capita")+ggtitle("Q4")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
p4
a[sort(c(624,328,1546,1551,195,684,522,515,1060,573,677,71,517,511,69,616, 691 ,935)),]->index
as.data.frame(index)->index
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`%in%index$index)%>%arrange(`Indicator Name`)->iran
WDIData%>%filter(`Country Name`=="World")%>%
filter(`Indicator Name`%in%index$index)%>%arrange(`Indicator Name`)->WLD
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." | `Country Name`=="World")%>%
filter(`Indicator Name`=="GDP per capita (constant 2010 US$)")->GDP
iran_GDP=data.frame(data=as.numeric(GDP[2,5:62]),year=1960:2017)
iran_GDP$data=iran_GDP$data/1000
WLD_GDP=data.frame(data=as.numeric(GDP[1,5:62]),year=1960:2017)
WLD_GDP$data=WLD_GDP$data/1000
lst1 = list()
for (i in 1:18) {
data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
hc_title(text =as.character(iran[i,3]))%>%
hc_add_theme(hc_theme_sandsignika())->hc1
lst1[[i]] <- hc1
}
hchart(iran_GDP, "spline", hcaes(y = data, x = year), name = "IRR GDP per capita in 1000$")%>%
hc_add_series(WLD_GDP, "spline", hcaes(y = data, x = year), name = "WLD GDP per capita in 1000$")%>%
hc_title(text ="GDP")%>%
hc_add_theme(hc_theme_sandsignika())->hc1
lst1[[19]] <- hc1
htmltools::tagList(lst1)
WDIData%>%filter(`Indicator Name`%in%index$index)%>%arrange(`Country Code`,`Indicator Name`)%>%
select(code=`Country Code`,data=`2005`,`Indicator Name`)%>%group_by(code)%>%
mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6
codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
w1=t(as.table(x$data))
w=rbind(w,w1)
}
w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)
Q6_data[111,"N"]=Q6_data[258,"N"]
Q6_data%>%drop_na()->Q6_data
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 7, 18, 27
##
## Cluster means:
## A B C D E F G
## 1 1.584334 2.749674 2.416700 1758.9597 8.931237 3.927011 3.4174204
## 2 1.798439 1.584434 2.530723 3797.0983 7.431177 6.133619 9.1851387
## 3 1.942006 10.277547 7.548247 262.3142 8.780996 4.663879 0.8680279
## H I J K L M N
## 1 85.68544 12.25412 12.987823 15775.728 25.20177 2.469418 4.302026
## 2 74.90950 12.62080 21.136976 25308.283 27.49952 1.858166 10.636841
## 3 74.09277 14.48245 8.944539 3430.144 32.63414 5.958033 2.929275
## O P Q R
## 1 2.377591 97.59748 9.679224 8.275978
## 2 1.398093 72.27502 6.559527 6.186059
## 3 1.997948 95.49833 10.648896 8.977103
##
## Clustering vector:
## 9 12 16 20 24 28 35 36 52 57 64 66 67 70 72 74 76 80
## 3 2 2 3 3 3 3 2 1 2 1 3 2 3 1 2 2 2
## 88 94 100 110 111 114 115 118 138 142 149 166 169 175 176 180 183 185
## 1 2 3 2 3 1 2 2 3 3 3 3 2 2 2 2 3 3
## 186 189 193 197 200 203 210 221 222 230 239 242 247 249 258 262
## 3 3 1 2 3 3 3 1 2 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 54023434 423536041 133751826
## (between_SS / total_SS = 89.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)
Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names
print(IRN_clus_names)
## # A tibble: 27 x 1
## `Country Name`
## <chr>
## 1 Central Europe and the Baltics
## 2 Europe & Central Asia (IDA & IBRD countries)
## 3 Lower middle income
## 4 South Asia
## 5 South Asia (IDA & IBRD)
## 6 World
## 7 Armenia
## 8 Belarus
## 9 Brazil
## 10 Bulgaria
## # ... with 17 more rows
Most of countries are in one cluster, and only a small portion of them are in other 2. The clustring result is satisfying.
# Q7
library(ggbiplot)
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
PCA = Q7_pca$sdev^2
qcc::pareto.chart(PCA)
##
## Pareto chart analysis for PCA
## Frequency Cum.Freq. Percentage Cum.Percent.
## A 5.25846322 5.25846322 29.21368457 29.21368457
## B 2.21799589 7.47645911 12.32219939 41.53588395
## C 1.95400125 9.43046036 10.85556248 52.39144643
## D 1.53212686 10.96258722 8.51181590 60.90326233
## E 1.30165405 12.26424127 7.23141138 68.13467371
## F 1.13687394 13.40111521 6.31596634 74.45064005
## G 0.91562860 14.31674381 5.08682554 79.53746559
## H 0.72901719 15.04576100 4.05009551 83.58756111
## I 0.63998675 15.68574775 3.55548196 87.14304307
## J 0.62145722 16.30720497 3.45254012 90.59558319
## K 0.48582936 16.79303433 2.69905198 93.29463517
## L 0.36092562 17.15395995 2.00514232 95.29977749
## M 0.25461248 17.40857243 1.41451379 96.71429128
## N 0.24191418 17.65048661 1.34396766 98.05825894
## O 0.16993982 17.82042643 0.94411013 99.00236907
## P 0.11401231 17.93443874 0.63340172 99.63577079
## Q 0.05056338 17.98500212 0.28090767 99.91667845
## R 0.01499788 18.00000000 0.08332155 100.00000000
Nine comes sooner than eight here!
iran[,40:62]->ir_Q8
ir_Q8=as.data.frame(t(as.matrix(ir_Q8)))
ir_Q8$growth=iran_GDP[36:58,1]
library(h2o)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 18 minutes 28 seconds
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 7 months and 13 days !!!
## H2O cluster name: H2O_started_from_R_Tara_tqw710
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.32 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
happly = as.h2o(ir_Q8)
##
|
| | 0%
|
|=================================================================| 100%
hglm = h2o.glm(y = "growth", x= c("V1","V2","V3","V4","V5","V6",
"V7","V8","V9","V10","V11","V12",
"V13","V14","V15","V16","V17","V18"),
training_frame = happly, family="gaussian")
##
|
| | 0%
|
|=================================================================| 100%
hglm
## Model Details:
## ==============
##
## H2ORegressionModel: glm
## Model ID: GLM_model_R_1531547753200_2
## GLM Model: summary
## family link regularization
## 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.1456 )
## number_of_predictors_total number_of_active_predictors
## 1 17 3
## number_of_iterations training_frame
## 1 1 ir_Q8
##
## Coefficients: glm coefficients
## names coefficients standardized_coefficients
## 1 Intercept 2.072005 5.551278
## 2 V1 0.000000 0.000000
## 3 V2 -0.001817 -0.003502
## 4 V3 0.000000 0.000000
## 5 V4 0.000000 0.000000
## 6 V5 0.000000 0.000000
## 7 V6 0.000000 0.000000
## 8 V7 0.000000 0.000000
## 9 V8 0.000000 0.000000
## 10 V9 0.000000 0.000000
## 11 V10 0.000000 0.000000
## 12 V11 0.001499 0.662405
## 13 V12 0.000000 0.000000
## 14 V13 -0.000192 -0.001890
## 15 V14 0.000000 0.000000
## 16 V15 0.000000 0.000000
## 17 V17 0.000000 0.000000
## 18 V18 0.000000 0.000000
##
## H2ORegressionMetrics: glm
## ** Reported on training data. **
##
## MSE: 0.04870777
## RMSE: 0.2206984
## MAE: 0.1725157
## RMSLE: 0.03317761
## Mean Residual Deviance : 0.04870777
## R^2 : 0.9238002
## Null Deviance :14.06265
## Null D.o.F. :21
## Residual Deviance :1.071571
## Residual D.o.F. :18
## AIC :5.951126
Most of countries are in one cluster, and only a small portion of them are in other 2.
# Q9
health=a[sort(c(57,139,51,479,332,646,775,1235,1251,1078,1323,1131)),]
edu=a[sort(c(572,574,759,1260,1356,1350,1328,1329,1259,1260,1288,1289,598,599)),]
Health related criteria are:
health
## [1] Adolescent fertility rate (births per 1,000 women ages 15-19)
## [2] Age dependency ratio (% of working-age population)
## [3] Birth rate, crude (per 1,000 people)
## [4] Death rate, crude (per 1,000 people)
## [5] Fertility rate, total (births per woman)
## [6] Immunization, DPT (% of children ages 12-23 months)
## [7] Life expectancy at birth, total (years)
## [8] Population ages 0-14 (% of total)
## [9] Population growth (annual %)
## [10] Prevalence of anemia among children (% of children under 5)
## [11] Prevalence of undernourishment (% of population)
## [12] Refugee population by country or territory of origin
## 1591 Levels: 2005 PPP conversion factor, GDP (LCU per international $) ...
Education related criteria are:
edu
## [1] Government expenditure on education, total (% of GDP)
## [2] Government expenditure per student, primary (% of GDP per capita)
## [3] Gross intake ratio in first grade of primary education, female (% of relevant age group)
## [4] Gross intake ratio in first grade of primary education, male (% of relevant age group)
## [5] Labor force, total
## [6] Primary completion rate, female (% of relevant age group)
## [7] Primary completion rate, male (% of relevant age group)
## [8] Primary completion rate, male (% of relevant age group)
## [9] Progression to secondary school, female (%)
## [10] Progression to secondary school, male (%)
## [11] Repeaters, primary, female (% of female enrollment)
## [12] Repeaters, primary, male (% of male enrollment)
## [13] School enrollment, preprimary (% gross)
## [14] School enrollment, primary and secondary (gross), gender parity index (GPI)
## 1591 Levels: 2005 PPP conversion factor, GDP (LCU per international $) ...
# health
### 5
as.data.frame(health)->health
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`%in%health$health)%>%arrange(`Indicator Name`)->iran
WDIData%>%filter(`Country Name`=="World")%>%
filter(`Indicator Name`%in%health$health)%>%arrange(`Indicator Name`)->WLD
lst2 = list()
for (i in 1:12) {
data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
hc_title(text =as.character(iran[i,3]))%>%
hc_add_theme(hc_theme_sandsignika())->hc1
lst2[[i]] <- hc1
}
htmltools::tagList(lst2)
### Q6
WDIData%>%filter(`Indicator Name`%in% health$health)%>%arrange(`Country Code`,`Indicator Name`)%>%
select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6
codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
w1=t(as.table(x$data))
w=rbind(w,w1)
}
w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)%>%na.omit()
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 8, 186, 14
##
## Cluster means:
## A B C D E F G H
## 1 83.48074 73.73819 30.28923 8.698515 3.954039 79.48215 64.14677 36.91023
## 2 50.83986 58.42074 21.28926 8.125110 2.759095 89.74555 70.98903 27.89793
## 3 64.82178 67.37208 27.69822 7.864780 3.584872 81.67857 66.15822 34.59174
## I J K L
## 1 2.084558 54.53991 18.32039 7825597.88
## 2 1.346483 34.67623 11.03765 87349.99
## 3 1.986412 48.40044 15.10230 3115756.00
##
## Clustering vector:
## 2 3 4 6 7 8 9 11 12 13 14 16 17 18 19 20 22 23
## 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 24 25 27 28 29 30 32 33 34 35 36 38 39 40 41 43 44 46
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 47 48 49 52 53 54 55 57 58 59 60 61 62 63 64 65 66 67
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 69 70 71 72 74 75 76 79 80 81 82 84 85 86 88 89 91 93
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 94 96 97 99 100 101 102 103 104 105 106 108 110 111 112 113 114 115
## 2 2 1 2 2 3 1 1 2 2 1 2 2 2 2 2 2 2
## 116 117 118 119 120 121 122 123 125 126 127 128 129 130 132 133 134 135
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3
## 137 138 139 140 141 142 144 147 149 150 151 152 153 155 156 157 158 159
## 2 3 1 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2
## 160 161 162 164 165 166 167 168 169 170 172 173 174 175 176 177 179 180
## 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 181 182 183 184 185 186 189 190 192 193 194 196 197 200 201 202 203 204
## 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 3 2
## 206 208 209 210 213 214 216 217 218 219 220 221 222 223 228 229 230 231
## 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2
## 232 233 234 235 236 237 239 240 241 242 243 245 246 247 248 249 250 251
## 2 2 2 2 2 2 3 3 2 2 2 2 2 2 3 2 2 2
## 252 253 256 257 258 259 261 262 263 264
## 2 2 2 2 1 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 2.504311e+13 1.245033e+13 1.335006e+13
## (between_SS / total_SS = 91.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)
Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names
print(IRN_clus_names)
## # A tibble: 186 x 1
## `Country Name`
## <chr>
## 1 Caribbean small states
## 2 Central Europe and the Baltics
## 3 Early-demographic dividend
## 4 East Asia & Pacific
## 5 East Asia & Pacific (excluding high income)
## 6 East Asia & Pacific (IDA & IBRD countries)
## 7 Euro area
## 8 Europe & Central Asia
## 9 Europe & Central Asia (excluding high income)
## 10 Europe & Central Asia (IDA & IBRD countries)
## # ... with 176 more rows
### Q7
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
PCA = Q7_pca$sdev^2
#names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)
##
## Pareto chart analysis for PCA
## Frequency Cum.Freq. Percentage Cum.Percent.
## A 7.688220e+00 7.688220e+00 6.406850e+01 6.406850e+01
## B 1.421548e+00 9.109767e+00 1.184623e+01 7.591473e+01
## C 9.770108e-01 1.008678e+01 8.141757e+00 8.405648e+01
## D 5.668615e-01 1.065364e+01 4.723846e+00 8.878033e+01
## E 4.602102e-01 1.111385e+01 3.835085e+00 9.261542e+01
## F 3.189776e-01 1.143283e+01 2.658146e+00 9.527356e+01
## G 2.401360e-01 1.167296e+01 2.001134e+00 9.727470e+01
## H 1.854594e-01 1.185842e+01 1.545495e+00 9.882019e+01
## I 9.714883e-02 1.195557e+01 8.095736e-01 9.962976e+01
## J 3.388960e-02 1.198946e+01 2.824133e-01 9.991218e+01
## K 6.965037e-03 1.199643e+01 5.804198e-02 9.997022e+01
## L 3.573705e-03 1.200000e+01 2.978087e-02 1.000000e+02
#####################################################################
#####################################################################
# EDU
### 5
as.data.frame(edu)->edu
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`%in%edu$edu)%>%arrange(`Indicator Name`)->iran
WDIData%>%filter(`Country Name`=="World")%>%
filter(`Indicator Name`%in%edu$edu)%>%arrange(`Indicator Name`)->WLD
lst3 = list()
for (i in 1:13) {
data.frame(value=as.numeric(iran[i,5:62]),year=1960:2017)->x
data.frame(value=as.numeric(WLD[i,5:62]),year=1960:2017)->x1
hchart(x,"line", hcaes(y = value, x = year),name= concatenate("IRR ",as.character(iran[i,3])))%>%
hc_add_series(x1, "spline", hcaes(y = value, x = year), name = concatenate("WLD ",as.character(iran[i,3])))%>%
hc_title(text =as.character(iran[i,3]))%>%
hc_add_theme(hc_theme_sandsignika())->hc1
lst3[[i]] <- hc1
}
htmltools::tagList(lst3)
### Q6
WDIData%>%filter(`Indicator Name`%in% edu$edu)%>%arrange(`Country Code`,`Indicator Name`)%>%
select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
mutate(indicator=row_number(code))%>%select(code,indicator,data)->q6
codeList=unique(q6$code)
#264
q6%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
q6%>%filter(code==codeList[i])%>%arrange(indicator)->x
w1=t(as.table(x$data))
w=rbind(w,w1)
}
w=as.data.frame(w)
Q6_data=cbind(as.data.frame(codeList),w)%>%na.omit()
kcl = kmeans(Q6_data[,-1],centers = 3)
kcl
## K-means clustering with 3 clusters of sizes 59, 4, 12
##
## Cluster means:
## A B C D E F G
## 1 4.893564 17.48515 103.0388 105.5061 41595917 90.22627 91.36756
## 2 4.192131 13.70823 107.4102 109.5405 2653954658 90.19385 91.90959
## 3 3.963129 12.56628 108.3180 113.3255 591662012 80.24198 83.64485
## H I J K L M
## 1 93.20675 93.81361 5.064084 5.359394 65.86780 0.9761970
## 2 91.83340 92.02502 4.847448 4.359955 37.84992 0.9751325
## 3 86.53433 87.01977 6.259462 5.736092 36.98487 0.9469842
##
## Clustering vector:
## 8 13 15 20 27 31 35 36 38 41 44 46 47 48 52 53 57 64
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
## 65 67 70 71 72 74 82 91 97 101 102 103 106 111 113 114 115 127
## 1 1 1 1 1 1 1 1 1 2 2 3 3 1 1 1 1 1
## 128 133 134 135 137 139 141 142 144 149 153 158 166 172 180 182 184 185
## 1 1 3 1 1 2 3 1 1 1 1 1 1 1 3 1 1 1
## 190 194 200 203 206 209 213 214 216 217 220 221 222 223 228 235 239 240
## 1 1 1 3 1 1 1 3 3 1 1 1 1 1 1 1 3 3
## 247 248 258
## 1 3 2
##
## Within cluster sum of squares by cluster:
## [1] 4.249520e+17 6.115389e+17 1.210090e+18
## (between_SS / total_SS = 92.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(Q6_data,col = kcl$cluster,pch = 20,cex = 2)
Q6=data.frame(name=Q6_data[,1],cluster=kcl$cluster)
ir_cls_num=Q6$cluster[which(as.character(Q6$name)=="IRN")]
Q6%>%filter(cluster==ir_cls_num)->IRN_clus
WDIData%>%filter(`Country Code`%in% IRN_clus$name)%>%select(`Country Name`)%>%unique()->IRN_clus_names
print(IRN_clus_names)
## # A tibble: 59 x 1
## `Country Name`
## <chr>
## 1 Caribbean small states
## 2 Central Europe and the Baltics
## 3 Euro area
## 4 European Union
## 5 Heavily indebted poor countries (HIPC)
## 6 Latin America & Caribbean
## 7 Latin America & Caribbean (excluding high income)
## 8 Latin America & the Caribbean (IDA & IBRD countries)
## 9 Low income
## 10 Other small states
## # ... with 49 more rows
### Q7
Q7_pca = prcomp(Q6_data[,-1], scale. = TRUE)
ggbiplot(Q7_pca, obs.scale = 1, var.scale = 1,
groups =as.character( kcl$cluster), ellipse = TRUE, circle = F) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
PCA = Q7_pca$sdev^2
#names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)
##
## Pareto chart analysis for PCA
## Frequency Cum.Freq. Percentage Cum.Percent.
## A 7.370078e+00 7.370078e+00 5.669291e+01 5.669291e+01
## B 1.410093e+00 8.780171e+00 1.084687e+01 6.753978e+01
## C 1.173857e+00 9.954028e+00 9.029672e+00 7.656945e+01
## D 9.567394e-01 1.091077e+01 7.359534e+00 8.392898e+01
## E 6.604114e-01 1.157118e+01 5.080088e+00 8.900907e+01
## F 4.969341e-01 1.206811e+01 3.822570e+00 9.283164e+01
## G 3.541847e-01 1.242230e+01 2.724498e+00 9.555614e+01
## H 3.114850e-01 1.273378e+01 2.396038e+00 9.795217e+01
## I 1.629921e-01 1.289677e+01 1.253785e+00 9.920596e+01
## J 4.951025e-02 1.294629e+01 3.808481e-01 9.958681e+01
## K 3.495632e-02 1.298124e+01 2.688948e-01 9.985570e+01
## L 1.497222e-02 1.299621e+01 1.151709e-01 9.997087e+01
## M 3.786438e-03 1.300000e+01 2.912644e-02 1.000000e+02
as.data.frame(c(as.character(health[,1]),as.character(edu[,1]),as.character(index[,1])))%>%unique()->all
names(all)="all"
WDIData%>%filter(`Indicator Name`%in% all$all)%>%arrange(`Country Code`,`Indicator Name`)%>%
select(code=`Country Code`,data=`2011`,`Indicator Name`)%>%group_by(code)%>%
mutate(indicator=row_number(code))%>%select(code,indicator,data)->q10
codeList=unique(q10$code)
#264
q10%>%filter(code==codeList[1])%>%arrange(indicator)->x
w=t(as.table(x$data))
for (i in 2:264) {
q10%>%filter(code==codeList[i])%>%arrange(indicator)->x
w1=t(as.table(x$data))
w=rbind(w,w1)
}
w=as.data.frame(w)
Q10_data=cbind(as.data.frame(codeList),w)
Q10_data=Q10_data[,-c(25,28)]
Q10_data=Q10_data%>%na.omit()
row.names(Q10_data)=Q10_data$codeList
Q10_data=Q10_data[,-1]
dist = stats::dist(Q10_data,method = "euclidean")
clus = hclust(dist,method = "complete")
plot(clus)
rect.hclust(clus, 3)
Prevalence of anemia among children has constantly decresed in iran. it was as high as 50% just 30 years ago, and is currently less than 30%. Switzerland is one of the ,ost developed and wealthiest countries. Its rate is substantially lower. However, since 2010 the Prevalence of anemia has increased by 3% which is realy odd for such a country!
#"Prevalence of anemia among children"
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`==a[1235,])->q11_1_ir
q11_1_ir=data.frame(data=as.numeric(q11_1_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
filter(`Indicator Name`==a[1235,])->q11_1_ch
q11_1_ch=data.frame(data=as.numeric(q11_1_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="World" )%>%
filter(`Indicator Name`==a[1235,])->q11_1_wld
q11_1_wld=data.frame(data=as.numeric(q11_1_wld[,5:62]),year=1960:2017)
hchart(q11_1_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
hc_add_series(q11_1_wld, "spline", hcaes(y = data, x = year), name ="WLD")%>%
hc_add_series(q11_1_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
hc_title(text =a[1235,])%>%
hc_add_theme(hc_theme_sandsignika())
The immunization against measles rate for Iran is one of the highest. Sunprisingly, the immunization rate for US and switzerland are lower than Iran! I googled it and found that it is not a compulsory vaccination in those two country. In addition to that, there are some debates whether measles vaccine is related to autism in the next generation.
“Measles vaccination is an important indicator of hesitancy and refusal, since misinformation accusing it of causing autism, despite evidence of its fraudulent source, is still going on.”
If you google the autism rate, it is 1.9% for Iran and 1.7% for US. Due to the fact that there are probably many austism cases in Iran that have not been counted, I wonder whether such speculation about measles vaccine can be proved true in future!
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`==a[648,])->q11_2_ir
q11_2_ir=data.frame(data=as.numeric(q11_2_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
filter(`Indicator Name`==a[648,])->q11_2_ch
q11_2_ch=data.frame(data=as.numeric(q11_2_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="United States" )%>%
filter(`Indicator Name`==a[648,])->q11_2_US
q11_2_US=data.frame(data=as.numeric(q11_2_US[,5:62]),year=1960:2017)
hchart(q11_2_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
hc_add_series(q11_2_US, "spline", hcaes(y = data, x = year), name ="US")%>%
hc_add_series(q11_2_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
hc_title(text =a[648,])%>%
hc_add_theme(hc_theme_sandsignika())
Ratio of girls to boys is “gender parity index”. Iran’s index has the highest growth and is now above 1. US’s index has been constantly above 1, but is now a little lower than Iran, I believe such a large growth is a sign of modernization of society in a developing country.
WDIData%>%filter(`Country Name`=="Iran, Islamic Rep." )%>%
filter(`Indicator Name`==a[1356,])->q11_3_ir
q11_3_ir=data.frame(data=as.numeric(q11_3_ir[,5:62]),year=1960:2017)
WDIData%>%filter(`Country Name`=="Switzerland" )%>%
filter(`Indicator Name`==a[1356,])->q11_3_ch
q11_3_ch=data.frame(data=as.numeric(q11_3_ch[,5:62]),year=1960:2017)
WDIData%>%filter( `Country Name`=="United States" )%>%
filter(`Indicator Name`==a[1356,])->q11_3_US
q11_3_US=data.frame(data=as.numeric(q11_3_US[,5:62]),year=1960:2017)
hchart(q11_3_ir,"line", hcaes(y = data, x = year),name="IRR")%>%
hc_add_series(q11_3_US, "spline", hcaes(y = data, x = year), name ="US")%>%
hc_add_series(q11_3_ch, "spline", hcaes(y = data, x = year), name ="CHF")%>%
hc_title(text =a[1356,])%>%
hc_add_theme(hc_theme_sandsignika())